UPDATE THIS
I'll test two basic classifiers: a RandomForest classifier and a Logistic Regression classifier. I'll try a very simplistic weak-learner: a straight i-band cut.
For my training data, I started by getting objects and labels from COSMOS. For input features, I then matched those COSMOS galaxies to their nearest HSC counterpart. I then used the HSC i-band magnitude, along with the HSC g-r, r-i, i-z, and z-y colors. Finally, I augmented it with some HSC photo-z information (FRANKEN-Z).
In this notebook I'll look at the full decision curves for the classifiers, in hopes of better understanding my results.
In [1]:
# give access to importing dwarfz
import os, sys
# Everything in the working-directory path before the first "dwarfz" is taken
# as the directory holding the package; add it to sys.path once so that
# `import dwarfz` resolves.
dwarfz_package_dir = os.getcwd().partition("dwarfz")[0]
if dwarfz_package_dir not in sys.path:
    sys.path.insert(0, dwarfz_package_dir)
import dwarfz
# back to regular import statements
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(context="poster", style="ticks", font_scale=1.4)
import numpy as np
import pandas as pd
from scipy.special import expit
import sklearn
import sklearn.metrics
import pathlib
In [2]:
%env HDF5_DISABLE_VERSION_CHECK=1
In [3]:
import matplotlib as mpl

# Notebook-wide figure defaults: low-dpi, double-size (16x12) white figures.
mpl.rcParams.update({
    "savefig.dpi": 80,
    "figure.dpi": 80,
    "figure.figsize": 2*np.array((8,6)),
    "figure.facecolor": "white",
})
In [4]:
# COSMOS reference catalog (source of the labels).
COSMOS_filename = pathlib.Path(dwarfz.data_dir_default, "COSMOS_reference.sqlite")
COSMOS = dwarfz.datasets.COSMOS(COSMOS_filename)
In [5]:
# HSC forced photometry covering the COSMOS field (source of the input features).
HSC_filename = pathlib.Path(dwarfz.data_dir_default, "HSC_COSMOS_median_forced.sqlite3")
HSC = dwarfz.datasets.HSC(HSC_filename)
In [6]:
# Pre-computed COSMOS <-> HSC cross-match table.
matches_filename = pathlib.Path(dwarfz.data_dir_default, "matches.sqlite3")
matches_df = dwarfz.matching.Matches.load_from_filename(matches_filename)
In [7]:
# Keep only the rows flagged as a successful match.
combined = matches_df[matches_df.match].copy()

# COSMOS-side quantities, looked up through the index of `combined`.
for new_col, cosmos_col in [("ra", "ra"),
                            ("dec", "dec"),
                            ("photo_z", "photo_z"),
                            ("log_mass", "mass_med")]:
    combined[new_col] = COSMOS.df.loc[combined.index][cosmos_col]

# HSC cmodel photometry: flux, flux error, flux flag and magnitude per band.
photometry_cols = [
    "{}cmodel_{}".format(band, quantity)
    for band in "grizy"
    for quantity in ("flux", "flux_err", "flux_flags", "mag")
]

# HSC rows are looked up via `catalog_2_ids`; `.values` drops the HSC index so
# the assignment is positional.
for col in photometry_cols:
    combined[col] = HSC.df.loc[combined.catalog_2_ids][col].values
In [8]:
# Adjacent-band cmodel colours: g-r, r-i, i-z, z-y.
for first_band, second_band in zip("griz", "rizy"):
    combined["{}_minus_{}".format(first_band, second_band)] = (
        combined["{}cmodel_mag".format(first_band)]
        - combined["{}cmodel_mag".format(second_band)]
    )
In [9]:
# Keep a row only if all four colours and the i-band magnitude are finite ...
mask = np.isfinite(combined["g_minus_r"])
for finite_col in ["r_minus_i", "i_minus_z", "z_minus_y", "icmodel_mag"]:
    mask = mask & np.isfinite(combined[finite_col])

# ... and none of the five cmodel flux flags is set.
for band in "grizy":
    mask = mask & (~combined[band + "cmodel_flux_flags"])

combined = combined[mask]
In [10]:
# FRANKEN-Z photo-z table, indexed by `object_id`.
frankenz_db_path = pathlib.Path(dwarfz.data_dir_default) / "HSC_matched_to_FRANKENZ.sqlite"
frankenz_db_url = "sqlite:///{}".format(frankenz_db_path)

df_frankenz = pd.read_sql_table("photo_z", frankenz_db_url, index_col="object_id")
df_frankenz.head()
Out[10]:
In [11]:
# Attach the photo-z estimate and its risk, looked up via `catalog_2_ids`.
frankenz_cols = ["photoz_best", "photoz_risk_best"]
combined = combined.join(df_frankenz[frankenz_cols], on="catalog_2_ids")
In [12]:
images_dir = pathlib.Path.home() / "dwarfz" / "galaxies_narrowband"


def _image_ids(class_name):
    """Return the HSC ids (as a set of int) found under `images_dir/class_name`.

    Each non-hidden entry directly inside that directory is expected to be
    named after an integer HSC id; entries starting with "." are skipped.
    """
    return {int(subdir.name)
            for subdir in images_dir.glob("{}/*".format(class_name))
            if not subdir.name.startswith(".")}


HSC_ids_target = _image_ids("target")
HSC_ids_contaminant = _image_ids("contaminant")

# An id that shows up under both classes is ambiguous: drop it from both.
remove_ids = HSC_ids_target & HSC_ids_contaminant
HSC_ids_target -= remove_ids
HSC_ids_contaminant -= remove_ids
HSC_ids = HSC_ids_target | HSC_ids_contaminant

HSC_ids_target = np.array(list(HSC_ids_target))
HSC_ids_contaminant = np.array(list(HSC_ids_contaminant))
HSC_ids = np.array(sorted(HSC_ids)) # enforce a deterministic ordering

print(len(HSC_ids_target))
print(len(HSC_ids_contaminant))
print(len(HSC_ids), len(HSC_ids_target) + len(HSC_ids_contaminant))

# This count used to be a bare mid-cell expression, so it was never displayed.
print("ids with images but no row in `combined`:",
      (~np.isin(HSC_ids, combined.catalog_2_ids)).sum())

print(HSC_ids.shape)
In [13]:
# One row per HSC id with its class label.
# `HSC_ids_target` is a numpy array at this point, so a per-id `in` test would
# scan the whole array for every id; np.isin does the membership test in one
# vectorized call and yields the same boolean column.
df = pd.DataFrame(data={
    "HSC_id": HSC_ids,
    "target": np.isin(HSC_ids, HSC_ids_target),
}).set_index("HSC_id")
df.head()
Out[13]:
In [14]:
# COSMOS ids of every galaxy listed in either CSV.
cosmos_ids = set(pd.read_csv("target_galaxies-HSC_ids.csv", dtype=int).COSMOS_id)
cosmos_ids |= set(pd.read_csv("contaminant_galaxies-HSC_ids.csv", dtype=int).COSMOS_id)

# Keep rows whose COSMOS id (the index) is listed and whose HSC id has an image.
combined = combined[combined.index.isin(cosmos_ids)]
combined = combined[combined.catalog_2_ids.isin(HSC_ids)]

# drop duplicate HSC ids:
# every row whose HSC id is shared by more than one row is removed (none of the
# duplicates is kept). All duplicated ids are removed with a single filter
# instead of re-filtering the whole frame once per duplicated id.
from collections import Counter
match_counts = Counter(combined.catalog_2_ids)
duplicated_hsc_ids = [hsc_id
                      for hsc_id, num_matches in match_counts.items()
                      if num_matches > 1]
combined = combined[~combined.catalog_2_ids.isin(duplicated_hsc_ids)]

# Restrict HSC_ids to the ids that survived the filtering above.
mask_HSC_id_in_combined_filtered = np.isin(HSC_ids, combined.catalog_2_ids)
HSC_ids = HSC_ids[mask_HSC_id_in_combined_filtered].copy()

print(combined.shape)
combined.head()
Out[14]:
In [15]:
HSC_ids_target
Out[15]:
In [16]:
# Label each row: True when its HSC id is one of the target ids.
target_id_set = set(HSC_ids_target)
combined["target"] = combined.catalog_2_ids.isin(target_id_set).values

# Fraction of rows labelled as targets.
combined["target"].mean()
Out[16]:
In [17]:
df_vgg = pd.read_hdf("vgg_features.h5")

# Flatten each stored feature array into a 1-D vector.
df_vgg["vgg_features"] = df_vgg["vgg_features"].map(
    lambda feature_array: feature_array.flatten()
)

# Index by HSC id and put the rows in `HSC_ids` order.
df_vgg = df_vgg.set_index("HSC_id")
df_vgg = df_vgg.loc[HSC_ids]

print(df_vgg.shape)
df_vgg.head()
Out[17]:
In [18]:
feature_columns = [
    "g_minus_r", "r_minus_i", "i_minus_z", "z_minus_y",
    "icmodel_mag",
    "photoz_best",
    "photoz_risk_best",  # The risk of photoz_best being outside of the range z_true +- 0.15(1+z_true). It ranges from 0 (safe) to 1(risky)
]

# Row order matters: later cells stack `features` next to `df_vgg` (which is
# ordered by `HSC_ids`) and label test rows with `HSC_ids[ids_testing]`.
# `combined` keeps its own row order, so reorder it by `HSC_ids` here to make
# row i of `features`/`target` refer to `HSC_ids[i]`.
# (`HSC_ids` was already restricted to ids present, exactly once, in `combined`.)
combined_by_HSC_id = combined.set_index("catalog_2_ids").loc[HSC_ids]

features = combined_by_HSC_id.loc[:, feature_columns]
target = combined_by_HSC_id.loc[:, ["target"]]
In [19]:
# Stack the per-object VGG vectors into a 2-D matrix and append them, column-wise,
# after the photometric features.
vgg_matrix = np.array(list(df_vgg.vgg_features.values))
features_combined = np.concatenate([features.values, vgg_matrix], axis=1)
print(features_combined.shape)
In [20]:
target.mean()
Out[20]:
In [21]:
testing_fraction = .2

# Fixed seed, then a random permutation of the row positions 0..N-1.
np.random.seed(1)
ids_shuffled = np.random.permutation(range(HSC_ids.size))

# The first `testing_fraction` of the shuffled positions is the test set,
# the remainder is the training set.
N_testing_indices = int(testing_fraction*ids_shuffled.size)
ids_testing, ids_training = np.split(ids_shuffled, [N_testing_indices])

features_train, features_test = features_combined[ids_training], features_combined[ids_testing]
target_train, target_test = target.iloc[ids_training], target.iloc[ids_testing]
In [22]:
# Shared colours, legend labels and line width for every comparison figure.
color_RF = "g"
color_RF_with_vgg = "b"
color_RF_with_vgg_more = "orange"
color_CNN = "purple"
color_CNN_no_phot = "#069af3"
color_photoz = "#0165fc"
color_RF_no_photoz = sns.color_palette("hls", 8)[-1]

label_RF = "Random Forest"
label_RF_with_vgg = "RF (with VGG-19)"
# The newline must stay a real "\n", while the mathtext part needs literal
# backslashes. Writing both in one non-raw string relied on the invalid escape
# sequences "\m" and "\:", so the label is split into a plain and a raw literal.
# The resulting text is unchanged.
label_RF_with_vgg_more = "RF (with VGG-19)\n" r"$n_\mathrm{split\:features}=1024$"
label_CNN = "CNN"
label_CNN_no_phot = "CNN (no photometry)"
label_photoz = r"LR: only photo-$z$"
label_RF_no_photoz = r"RF: no photo-$z$"

linewidth=4
In [23]:
# Probability thresholds: 1000 points evenly spaced in logit space over [-9, 6],
# mapped through the logistic function, with exact 0 and 1 added at the ends.
interior_thresholds = expit(np.linspace(-9, 6, num=1000))
threshold_probs = np.concatenate(([0], interior_thresholds, [1]))
threshold_probs
Out[23]:
In [24]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the catalog features only. Those occupy the leading columns
# of `features_combined` (the VGG columns were appended after them), so take the
# column count from `features` rather than hard-coding it.
n_catalog_features = features.shape[1]
features_train_tmp = features_train[:, :n_catalog_features]
features_test_tmp = features_test[:, :n_catalog_features]

class_probs_filename_tmp = "class_probs.RF.h5"
recompute = False  # set True to retrain even if cached probabilities exist

if recompute or (not pathlib.Path(class_probs_filename_tmp).is_file()):
    classifier_RF = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    classifier_RF = classifier_RF.fit(features_train_tmp, target_train.values.flatten())

    target_prob_RF = classifier_RF.predict_proba(features_test_tmp)[:,1]
    # Shrink towards 1/2 so that no probability is exactly 0 or 1:
    # p -> (p*n + 1) / (n + 2), with n the number of trees.
    target_prob_RF = (target_prob_RF*classifier_RF.n_estimators + 1) / (2+classifier_RF.n_estimators)

    df_RF = pd.DataFrame(data={
        "HSC_id": HSC_ids[ids_testing],
        "target" : target_test.values.flatten(),
        "prob": target_prob_RF,
    })
    # `key` is passed by keyword: positional use is deprecated in pandas 2.x.
    df_RF.to_hdf(class_probs_filename_tmp, key="data")
else:
    df_RF = pd.read_hdf(class_probs_filename_tmp)

print("min prob: ", df_RF.prob.min())
print("max prob: ", df_RF.prob.max())
In [27]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on all columns of `features_combined` (catalog + VGG features),
# with the default `max_features`.
class_probs_filename_tmp = "class_probs.RF_with_vgg.h5"
recompute = False  # set True to retrain even if cached probabilities exist

if recompute or (not pathlib.Path(class_probs_filename_tmp).is_file()):
    classifier_RF_with_vgg = RandomForestClassifier(n_estimators=3000, n_jobs=8)
    classifier_RF_with_vgg = classifier_RF_with_vgg.fit(features_train, target_train.values.flatten())

    target_prob_RF_with_vgg = classifier_RF_with_vgg.predict_proba(features_test)[:,1]
    # Shrink towards 1/2 so that no probability is exactly 0 or 1:
    # p -> (p*n + 1) / (n + 2), with n the number of trees.
    target_prob_RF_with_vgg = (target_prob_RF_with_vgg*classifier_RF_with_vgg.n_estimators + 1) / (2+classifier_RF_with_vgg.n_estimators)

    df_RF_with_vgg = pd.DataFrame(data={
        "HSC_id": HSC_ids[ids_testing],
        "target" : target_test.values.flatten(),
        "prob": target_prob_RF_with_vgg,
    })
    # `key` is passed by keyword: positional use is deprecated in pandas 2.x.
    df_RF_with_vgg.to_hdf(class_probs_filename_tmp, key="data")
else:
    df_RF_with_vgg = pd.read_hdf(class_probs_filename_tmp)

print("min prob: ", df_RF_with_vgg.prob.min())
print("max prob: ", df_RF_with_vgg.prob.max())
In [28]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the catalog features whose name does not contain "photoz".
class_probs_filename_tmp = "class_probs.RF_no_photoz.h5"
recompute = False  # set True to retrain even if cached probabilities exist

if recompute or (not pathlib.Path(class_probs_filename_tmp).is_file()):
    # Column positions in `features`; the same positions are used to slice the
    # leading columns of features_train / features_test.
    keys = [i for i, key in enumerate(features) if "photoz" not in key]

    classifier_RF_no_photoz = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    classifier_RF_no_photoz = classifier_RF_no_photoz.fit(features_train[:, keys],
                                                          target_train.values.flatten())

    target_prob_RF_no_photoz = classifier_RF_no_photoz.predict_proba(features_test[:, keys])[:,1]
    # Shrink towards 1/2 so that no probability is exactly 0 or 1:
    # p -> (p*n + 1) / (n + 2), with n the number of trees.
    target_prob_RF_no_photoz = (target_prob_RF_no_photoz*classifier_RF_no_photoz.n_estimators + 1) / (2+classifier_RF_no_photoz.n_estimators)

    df_RF_no_photoz = pd.DataFrame(data={
        "HSC_id": HSC_ids[ids_testing],
        "target" : target_test.values.flatten(),
        "prob": target_prob_RF_no_photoz,
    })
    # `key` is passed by keyword: positional use is deprecated in pandas 2.x.
    df_RF_no_photoz.to_hdf(class_probs_filename_tmp, key="data")
else:
    df_RF_no_photoz = pd.read_hdf(class_probs_filename_tmp)

print("min prob: ", df_RF_no_photoz.prob.min())
print("max prob: ", df_RF_no_photoz.prob.max())
In [29]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on only the features whose name contains "photoz".
class_probs_filename_tmp = "class_probs.LR_photoz.h5"
recompute = False  # set True to retrain even if cached probabilities exist

if recompute or (not pathlib.Path(class_probs_filename_tmp).is_file()):
    # Column positions in `features`; the same positions are used to slice the
    # leading columns of features_train / features_test.
    keys = [i for i, key in enumerate(features) if "photoz" in key]

    classifier_photoz = LogisticRegression(class_weight=None)
    classifier_photoz = classifier_photoz.fit(features_train[:, keys],
                                              np.array(target_train.values.flatten(), dtype=int))

    target_prob_photoz = classifier_photoz.predict_proba(features_test[:, keys])[:,1]

    df_LR_photoz = pd.DataFrame(data={
        "HSC_id": HSC_ids[ids_testing],
        "target" : target_test.values.flatten(),
        "prob": target_prob_photoz,
    })
    # `key` is passed by keyword: positional use is deprecated in pandas 2.x.
    df_LR_photoz.to_hdf(class_probs_filename_tmp, key="data")
else:
    df_LR_photoz = pd.read_hdf(class_probs_filename_tmp)

print("min prob: ", df_LR_photoz.prob.min())
print("max prob: ", df_LR_photoz.prob.max())
In [30]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on all columns of `features_combined`, considering 1024
# candidate features per split (`max_features=1024`).
class_probs_filename_tmp = "class_probs.RF_with_vgg_more.h5"
recompute = False  # set True to retrain even if cached probabilities exist

if recompute or (not pathlib.Path(class_probs_filename_tmp).is_file()):
    classifier_RF_with_vgg_more = RandomForestClassifier(n_estimators=3000,
                                                         max_features=1024,
                                                         n_jobs=6,
                                                         )
    classifier_RF_with_vgg_more = classifier_RF_with_vgg_more.fit(features_train, target_train.values.flatten())

    target_prob_RF_with_vgg_more = classifier_RF_with_vgg_more.predict_proba(features_test)[:,1]
    # Shrink towards 1/2 so that no probability is exactly 0 or 1:
    # p -> (p*n + 1) / (n + 2), with n the number of trees.
    target_prob_RF_with_vgg_more = (target_prob_RF_with_vgg_more*classifier_RF_with_vgg_more.n_estimators + 1) / (2+classifier_RF_with_vgg_more.n_estimators)

    df_RF_with_vgg_more = pd.DataFrame(data={
        "HSC_id": HSC_ids[ids_testing],
        "target" : target_test.values.flatten(),
        "prob": target_prob_RF_with_vgg_more,
    })
    # `key` is passed by keyword: positional use is deprecated in pandas 2.x.
    df_RF_with_vgg_more.to_hdf(class_probs_filename_tmp, key="data")
else:
    df_RF_with_vgg_more = pd.read_hdf(class_probs_filename_tmp)

print("min prob: ", df_RF_with_vgg_more.prob.min())
print("max prob: ", df_RF_with_vgg_more.prob.max())
In [31]:
# CNN probabilities read from "class_probs.extracted_features.csv"
# (plotted elsewhere in this notebook as "CNN (no photometry)").
class_probs_filename = "class_probs.extracted_features.csv"
df_cnn = pd.read_csv(class_probs_filename).set_index("HSC_id").loc[HSC_ids]

# Range of the probabilities on the rows flagged as testing.
cnn_testing_probs = df_cnn[df_cnn.testing].CNN_prob
print(cnn_testing_probs.min())
print(cnn_testing_probs.max())

df_cnn.head()
Out[31]:
In [32]:
# CNN probabilities read from "class_probs.with_photometry.csv"
# (plotted elsewhere in this notebook as "CNN").
class_probs_filename = "class_probs.with_photometry.csv"
df_cnn_with_photometry = pd.read_csv(class_probs_filename).set_index("HSC_id").loc[HSC_ids]

# Range of the probabilities on the rows flagged as testing.
cnn_with_photometry_testing_probs = df_cnn_with_photometry[df_cnn_with_photometry.testing].CNN_prob
print(cnn_with_photometry_testing_probs.min())
print(cnn_with_photometry_testing_probs.max())

df_cnn_with_photometry.head()
Out[32]:
In [33]:
def get_TPR_FPR_recall_precision(predicted_probs_test, target=target_test, threshold_probs=threshold_probs):
    """Tabulate ROC and precision-recall points over a sweep of thresholds.

    An object counts as selected when its predicted probability is strictly
    greater than the threshold.

    Parameters
    ----------
    predicted_probs_test : array-like of float
        Predicted probability of the positive class, one entry per object.
    target : array-like of bool (or 0/1)
        True labels, in the same order as `predicted_probs_test`. A Series,
        single-column DataFrame or ndarray is accepted; it is flattened and
        cast to bool, so integer 0/1 labels are not mistaken for positions.
    threshold_probs : array-like of float
        Thresholds to evaluate.

    Returns
    -------
    TPR, FPR, recall, precision : np.ndarray
        One value per threshold. `recall` holds the same values as `TPR` but
        is a separate array. An entry is NaN when its denominator is empty
        (e.g. precision at a threshold that selects nothing).
    """
    # Work on plain arrays so that every comparison is positional and never
    # depends on pandas index alignment between the two inputs.
    probs = np.asarray(predicted_probs_test, dtype=float).reshape(-1)
    is_target = np.asarray(target).reshape(-1).astype(bool)
    thresholds = np.asarray(threshold_probs, dtype=float)

    def _fraction(flags):
        # Mean of a boolean array; NaN (without a warning) when it is empty.
        return flags.mean() if flags.size else np.nan

    TPR = np.empty_like(thresholds)
    FPR = np.empty_like(thresholds)
    precision = np.empty_like(thresholds)

    # The class membership does not depend on the threshold: split once.
    probs_of_targets = probs[is_target]
    probs_of_non_targets = probs[~is_target]

    for i, threshold in enumerate(thresholds):
        selected = probs > threshold
        TPR[i] = _fraction(probs_of_targets > threshold)
        FPR[i] = _fraction(probs_of_non_targets > threshold)
        precision[i] = _fraction(is_target[selected])

    # Recall is the true-positive rate by definition.
    recall = TPR.copy()

    return TPR, FPR, recall, precision
In [34]:
# Threshold sweep for the random forest trained on catalog features.
TPR_RF, FPR_RF, recall_RF, precision_RF = get_TPR_FPR_recall_precision(
    df_RF.prob,
    target=df_RF.target,
)
In [35]:
# Threshold sweep for the random forest that also uses the VGG features.
TPR_RF_with_vgg, FPR_RF_with_vgg, recall_RF_with_vgg, precision_RF_with_vgg = (
    get_TPR_FPR_recall_precision(df_RF_with_vgg.prob,
                                 target=df_RF_with_vgg.target)
)
In [36]:
# Threshold sweep for the VGG random forest trained with max_features=1024.
curves_RF_with_vgg_more = get_TPR_FPR_recall_precision(
    df_RF_with_vgg_more.prob,
    target=df_RF_with_vgg_more.target,
)
TPR_RF_with_vgg_more = curves_RF_with_vgg_more[0]
FPR_RF_with_vgg_more = curves_RF_with_vgg_more[1]
recall_RF_with_vgg_more = curves_RF_with_vgg_more[2]
precision_RF_with_vgg_more = curves_RF_with_vgg_more[3]
In [37]:
# Threshold sweep for the CNN (with photometry), on its testing rows only.
cnn_with_photometry_test_rows = df_cnn_with_photometry[df_cnn_with_photometry.testing]
TPR_CNN, FPR_CNN, recall_CNN, precision_CNN = get_TPR_FPR_recall_precision(
    cnn_with_photometry_test_rows.CNN_prob,
    target=cnn_with_photometry_test_rows.target,
)
In [38]:
# Threshold sweep for the CNN without photometry, on its testing rows only.
cnn_no_phot_test_rows = df_cnn[df_cnn.testing]
TPR_CNN_no_phot, FPR_CNN_no_phot, recall_CNN_no_phot, precision_CNN_no_phot = (
    get_TPR_FPR_recall_precision(cnn_no_phot_test_rows.CNN_prob,
                                 target=cnn_no_phot_test_rows.target)
)
In [39]:
# Threshold sweep for the photo-z-only logistic regression.
TPR_LR_photoz, FPR_LR_photoz, recall_LR_photoz, precision_LR_photoz = (
    get_TPR_FPR_recall_precision(df_LR_photoz.prob,
                                 target=df_LR_photoz.target)
)
In [40]:
# Threshold sweep for the random forest trained without the photo-z features.
TPR_RF_no_photoz, FPR_RF_no_photoz, recall_RF_no_photoz, precision_RF_no_photoz = (
    get_TPR_FPR_recall_precision(df_RF_no_photoz.prob,
                                 target=df_RF_no_photoz.target)
)
In [44]:
def plot_ROC_curves(with_RF=False, with_RF_with_vgg=False,
                    with_RF_with_vgg_more=False,
                    with_CNN=False, with_CNN_no_phot=False,
                    with_RF_no_photoz=False, with_photoz=False):
    """Draw the ROC curves of the selected classifiers on the current axes.

    Each `with_*` flag switches one classifier on. Curves are always added in
    the same fixed order (RF, CNN, CNN without photometry, RF without photo-z,
    RF with VGG and max_features=1024, RF with VGG, photo-z-only LR), followed
    by the dashed "Random guessing" diagonal, so legend entries keep that order.
    Each legend label carries the ROC AUC computed by sklearn.
    """
    def draw(FPR, TPR, label, color, truth, probs):
        auc = sklearn.metrics.roc_auc_score(truth, probs)
        plt.plot(FPR, TPR,
                 label=label + "\n(AUC={:.2f})".format(auc),
                 color=color,
                 linewidth=linewidth,
                 )

    if with_RF:
        draw(FPR_RF, TPR_RF, label_RF, color_RF,
             df_RF.target, df_RF.prob)

    if with_CNN:
        cnn_rows = df_cnn_with_photometry[df_cnn_with_photometry.testing]
        draw(FPR_CNN, TPR_CNN, label_CNN, color_CNN,
             cnn_rows.target, cnn_rows.CNN_prob)

    if with_CNN_no_phot:
        cnn_no_phot_rows = df_cnn[df_cnn.testing]
        draw(FPR_CNN_no_phot, TPR_CNN_no_phot, label_CNN_no_phot, color_CNN_no_phot,
             cnn_no_phot_rows.target, cnn_no_phot_rows.CNN_prob)

    if with_RF_no_photoz:
        draw(FPR_RF_no_photoz, TPR_RF_no_photoz, label_RF_no_photoz, color_RF_no_photoz,
             df_RF_no_photoz.target, df_RF_no_photoz.prob)

    if with_RF_with_vgg_more:
        draw(FPR_RF_with_vgg_more, TPR_RF_with_vgg_more,
             label_RF_with_vgg_more, color_RF_with_vgg_more,
             df_RF_with_vgg_more.target, df_RF_with_vgg_more.prob)

    if with_RF_with_vgg:
        draw(FPR_RF_with_vgg, TPR_RF_with_vgg, label_RF_with_vgg, color_RF_with_vgg,
             df_RF_with_vgg.target, df_RF_with_vgg.prob)

    if with_photoz:
        draw(FPR_LR_photoz, TPR_LR_photoz, label_photoz, color_photoz,
             df_LR_photoz.target, df_LR_photoz.prob)

    plt.xlim(0,1)
    plt.ylim(0,1)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")

    # The diagonal TPR == FPR is the reference for a classifier with no skill.
    plt.plot([0,1], [0,1], linestyle="dashed", label="Random guessing", color="black",
             linewidth=linewidth)

    plt.legend(loc="best")
In [45]:
# Overview: every classifier except the CNN without photometry.
plot_ROC_curves(
    with_RF=True,
    with_RF_no_photoz=True,
    with_photoz=True,
    with_RF_with_vgg=True,
    with_RF_with_vgg_more=True,
    with_CNN=True,
    with_CNN_no_phot=False,
)
In [46]:
# Thesis figure: RF vs. CNN vs. CNN without photometry.
plot_ROC_curves(with_RF=True, with_CNN=True, with_CNN_no_phot=True)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "ROC-CNN_and_CNN_no_photometry"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [47]:
# Thesis figure: RF vs. CNN.
plot_ROC_curves(with_RF=True, with_CNN=True, with_CNN_no_phot=False)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "ROC-RF_and_CNN"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [48]:
# Thesis figure: RF, both VGG-augmented RFs, and the CNN.
plot_ROC_curves(with_RF=True,
                with_RF_with_vgg=True,
                with_RF_with_vgg_more=True,
                with_CNN=True)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "ROC-RF_and_RFvgg"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [49]:
# Thesis figure: RF, RF without photo-z, and the photo-z-only logistic regression.
plot_ROC_curves(with_RF=True, with_RF_no_photoz=True, with_photoz=True)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "ROC-RF_RFnoz_LRz"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [52]:
def plot_PR_curves(with_RF=False, with_RF_with_vgg=False,
                   with_RF_with_vgg_more=False,
                   with_CNN=False, with_CNN_no_phot=False,
                   with_RF_no_photoz=False, with_photoz=False):
    """Draw purity (precision) vs. completeness (recall) for the selected classifiers.

    Each `with_*` flag switches one classifier on. Curves are always added in
    the same fixed order (RF, CNN, CNN without photometry, RF without photo-z,
    RF with VGG and max_features=1024, RF with VGG, photo-z-only LR), followed
    by a dashed horizontal "Random guessing" line at the mean of `target_train`.
    The number shown as "AUC" in each label is sklearn's average precision score.
    """
    def draw(recall, precision, label, color, truth, probs):
        average_precision = sklearn.metrics.average_precision_score(truth, probs)
        plt.plot(recall, precision,
                 label=label + "\n(AUC={:.2f})".format(average_precision),
                 color=color,
                 linewidth=linewidth,
                 )

    if with_RF:
        draw(recall_RF, precision_RF, label_RF, color_RF,
             df_RF.target, df_RF.prob)

    if with_CNN:
        cnn_rows = df_cnn_with_photometry[df_cnn_with_photometry.testing]
        draw(recall_CNN, precision_CNN, label_CNN, color_CNN,
             cnn_rows.target, cnn_rows.CNN_prob)

    if with_CNN_no_phot:
        cnn_no_phot_rows = df_cnn[df_cnn.testing]
        draw(recall_CNN_no_phot, precision_CNN_no_phot, label_CNN_no_phot, color_CNN_no_phot,
             cnn_no_phot_rows.target, cnn_no_phot_rows.CNN_prob)

    if with_RF_no_photoz:
        draw(recall_RF_no_photoz, precision_RF_no_photoz, label_RF_no_photoz, color_RF_no_photoz,
             df_RF_no_photoz.target, df_RF_no_photoz.prob)

    if with_RF_with_vgg_more:
        draw(recall_RF_with_vgg_more, precision_RF_with_vgg_more,
             label_RF_with_vgg_more, color_RF_with_vgg_more,
             df_RF_with_vgg_more.target, df_RF_with_vgg_more.prob)

    if with_RF_with_vgg:
        draw(recall_RF_with_vgg, precision_RF_with_vgg, label_RF_with_vgg, color_RF_with_vgg,
             df_RF_with_vgg.target, df_RF_with_vgg.prob)

    if with_photoz:
        draw(recall_LR_photoz, precision_LR_photoz, label_photoz, color_photoz,
             df_LR_photoz.target, df_LR_photoz.prob)

    plt.xlim(0,1)
    plt.ylim(0,1)
    plt.xlabel("completeness")
    plt.ylabel("purity")

    # Baseline: the fraction of targets in the training labels.
    baseline_purity = target_train.values.mean()
    plt.axhline(baseline_purity, linestyle="dashed",
                label="Random guessing\n(AUC={:.2f})".format(baseline_purity),
                color="black",
                linewidth=linewidth,
                )

    plt.legend(loc="best")
In [53]:
# Overview: every classifier except the CNN without photometry.
plot_PR_curves(with_RF=True,
               with_RF_no_photoz=True, with_photoz=True,
               with_RF_with_vgg=True, with_RF_with_vgg_more=True,
               with_CNN=True, with_CNN_no_phot=False)

# Place the legend outside the axes, to the right.
plt.legend(loc="upper left", bbox_to_anchor=(1,1))
Out[53]:
In [54]:
# Thesis figure: RF vs. CNN vs. CNN without photometry.
plot_PR_curves(with_RF=True, with_CNN=True, with_CNN_no_phot=True)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "PR-RF_CNN_CNN_nophot"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [55]:
# Same three curves as the previous figure, saved under a second file name.
plot_PR_curves(
    with_RF=True,
    with_CNN=True,
    with_CNN_no_phot=True,
)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "PR-CNN_and_CNN_no_photometry"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [56]:
# Thesis figure: RF vs. the photo-z-only logistic regression.
plot_PR_curves(
    with_RF=True,
    with_photoz=True,
)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "PR-RF_LRz"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [57]:
# Thesis figure: RF, CNN and the VGG-augmented RF (max_features=1024).
plot_PR_curves(with_RF=True, with_CNN=True, with_RF_with_vgg_more=True)

# Read back the automatically generated legend texts. With these three flags
# the entries are RF, CNN, RF-with-VGG, and the random-guessing baseline.
leg = plt.legend()
labels = [text.get_text() for text in leg.get_texts()]

# Shorten the third entry: keep only its trailing "(AUC=...)" part and put a
# shorter name in front of it.
vgg_auc_part = labels[2].rsplit("(", 1)[-1]
relabeled = [
    labels[0],
    labels[1],
    "RF with VGG-19\n(" + vgg_auc_part,
    labels[3],
]
plt.legend(relabeled, loc="upper right")

plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "PR-RF_RF_vgg-only_good_CNN"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [58]:
# Thesis figure: RF vs. CNN.
plot_PR_curves(with_RF=True, with_CNN=True)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "PR-RF_and_CNN"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [59]:
# Thesis figure: RF, CNN and both VGG-augmented RFs.
plot_PR_curves(with_RF=True, with_CNN=True,
               with_RF_with_vgg_more=True, with_RF_with_vgg=True)

# Legend goes outside the axes; the layout rectangle is restricted to the
# lower 80% of the figure height.
plt.legend(loc="upper left", bbox_to_anchor=(1,1))
plt.tight_layout(rect=(0,0,1,.8))

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "PR-RF_and_RFvgg"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [60]:
# Thesis figure: RF, RF without photo-z, and the photo-z-only logistic regression.
plot_PR_curves(with_RF=True, with_RF_no_photoz=True, with_photoz=True)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "PR-RF_RFnoz_LRz"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [61]:
# Edges of 20 equal-width probability bins covering [0, 1].
theoretical_probs = np.linspace(0, 1, num=21)
In [62]:
# One (uninitialised) slot per probability bin, for each classifier.
n_calibration_bins = theoretical_probs.size-1

empirical_probs_RF = np.empty(n_calibration_bins)
empirical_probs_RF_with_vgg = np.empty(n_calibration_bins)
empirical_probs_RF_with_vgg_more = np.empty(n_calibration_bins)
empirical_probs_CNN = np.empty(n_calibration_bins)
empirical_probs_CNN_no_phot = np.empty(n_calibration_bins)
empirical_probs_RF_no_photoz = np.empty(n_calibration_bins)
empirical_probs_photoz = np.empty(n_calibration_bins)
In [63]:
# Fill each `empirical_probs_*` array with the fraction of true targets among
# the test objects whose reported probability falls into each bin.

# Testing rows of the two CNN tables (selected once, outside the loop).
cnn_with_photometry_test_rows = df_cnn_with_photometry[df_cnn_with_photometry.testing]
cnn_no_phot_test_rows = df_cnn[df_cnn.testing]

# (output array, reported probabilities, true labels) per classifier.
# The CNN-with-photometry entry takes probabilities AND labels from
# `df_cnn_with_photometry`; the labels were previously read from `df_cnn`
# while being masked with the other table's probabilities.
calibration_inputs = [
    (empirical_probs_RF, df_RF.prob, df_RF.target),
    (empirical_probs_RF_with_vgg, df_RF_with_vgg.prob, df_RF_with_vgg.target),
    (empirical_probs_RF_with_vgg_more, df_RF_with_vgg_more.prob, df_RF_with_vgg_more.target),
    (empirical_probs_CNN_no_phot, cnn_no_phot_test_rows.CNN_prob, cnn_no_phot_test_rows.target),
    (empirical_probs_CNN, cnn_with_photometry_test_rows.CNN_prob, cnn_with_photometry_test_rows.target),
    (empirical_probs_RF_no_photoz, df_RF_no_photoz.prob, df_RF_no_photoz.target),
    (empirical_probs_photoz, df_LR_photoz.prob, df_LR_photoz.target),
]

n_bins = theoretical_probs.size-1
for i in range(n_bins):
    prob_lim_low = theoretical_probs[i]
    prob_lim_high = theoretical_probs[i+1]
    is_last_bin = (i == n_bins-1)

    for empirical_probs, reported_probs, true_labels in calibration_inputs:
        # Bins are half-open [low, high), except the last one, which also
        # includes its upper edge so that a probability of exactly 1 is counted.
        if is_last_bin:
            in_bin = (reported_probs >= prob_lim_low) & (reported_probs <= prob_lim_high)
        else:
            in_bin = (reported_probs >= prob_lim_low) & (reported_probs < prob_lim_high)

        # The mean of an empty selection is NaN, i.e. the bin is left empty.
        empirical_probs[i] = true_labels[in_bin].mean()
In [64]:
def plot_probability_calibration(with_RF=False,
                                 with_RF_with_vgg=False,
                                 with_RF_with_vgg_more=False,
                                 with_CNN=False,
                                 with_CNN_no_phot=False,
                                 with_RF_no_photoz=False,
                                 with_photoz=False,
                                ):
    """Draw binned calibration curves of the selected classifiers on the current axes.

    For every enabled classifier the per-bin target fraction
    (`empirical_probs_*`) is drawn as a step function over the bin edges in
    `theoretical_probs`. A grey band marks, for each bin, the range of reported
    probabilities covered by that bin (the "ideal" region).

    Curves are always added in the same fixed order (RF, CNN, CNN without
    photometry, RF without photo-z, RF with VGG and max_features=1024,
    RF with VGG, photo-z-only LR), so legend entries keep that order.
    """
    def draw(empirical_probs, color, label):
        # The first value is repeated so that there is one y value per bin edge.
        # No `linestyle="steps"` here: plt.step already selects the step draw
        # style, and recent matplotlib rejects "steps" as a line style.
        plt.step(theoretical_probs, [empirical_probs[0], *empirical_probs],
                 color=color, label=label)

    if with_RF:
        draw(empirical_probs_RF, color_RF, label_RF)

    if with_CNN:
        draw(empirical_probs_CNN, color_CNN, label_CNN)

    if with_CNN_no_phot:
        draw(empirical_probs_CNN_no_phot, color_CNN_no_phot, label_CNN_no_phot)

    if with_RF_no_photoz:
        draw(empirical_probs_RF_no_photoz, color_RF_no_photoz, label_RF_no_photoz)

    if with_RF_with_vgg_more:
        draw(empirical_probs_RF_with_vgg_more, color_RF_with_vgg_more, label_RF_with_vgg_more)

    if with_RF_with_vgg:
        draw(empirical_probs_RF_with_vgg, color_RF_with_vgg, label_RF_with_vgg)

    if with_photoz:
        draw(empirical_probs_photoz, color_photoz, label_photoz)

    # Band between each bin's lower and upper edge (one bin width tall).
    bin_width = theoretical_probs[1]
    plt.fill_between(theoretical_probs, theoretical_probs-bin_width, theoretical_probs,
                     step="pre", color="black", label="ideal", alpha=.2,
                     )

    plt.xlabel("Reported Probability")
    plt.ylabel("Actual (Binned) Probability")

    plt.legend(loc="best")

    plt.xlim(0,1)
In [65]:
# Overview: calibration of every classifier.
plot_probability_calibration(with_RF=True,
                             with_RF_no_photoz=True,
                             with_photoz=True,
                             with_RF_with_vgg=True,
                             with_RF_with_vgg_more=True,
                             with_CNN=True,
                             with_CNN_no_phot=True)
In [66]:
# Thesis figure: calibration of RF vs. CNN.
plot_probability_calibration(with_RF=True, with_CNN=True)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "probability_calibration-RF_and_CNN"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [67]:
# Thesis figure: calibration of RF, CNN and CNN without photometry.
plot_probability_calibration(
    with_RF=True,
    with_CNN=True,
    with_CNN_no_phot=True,
)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "probability_calibration-CNN_and_CNN_no_photometry"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [68]:
# Thesis figure: calibration of RF, RF without photo-z, and the photo-z-only LR.
plot_probability_calibration(
    with_RF=True,
    with_RF_no_photoz=True,
    with_photoz=True,
)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "probability_calibration-RF_RFnoz_LRz"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [69]:
# Thesis figure: calibration of RF, CNN and both VGG-augmented RFs.
plot_probability_calibration(with_RF=True, with_CNN=True,
                             with_RF_with_vgg_more=True, with_RF_with_vgg=True)
plt.tight_layout()

# Save the same figure as both PNG and PDF.
plot_filename = pathlib.Path("plots_for_thesis") / "probability_calibration-RF_and_RFvgg"
for suffix in (".png", ".pdf"):
    plt.savefig(plot_filename.with_suffix(suffix))
In [70]:
combined.set_index("catalog_2_ids").loc[43158176442374445]
Out[70]:
In [71]:
sklearn.metrics.log_loss(df_RF.target, df_RF.prob)
Out[71]:
In [72]:
sklearn.metrics.log_loss(df_RF_no_photoz.target, df_RF_no_photoz.prob)
Out[72]:
In [73]:
sklearn.metrics.log_loss(df_LR_photoz.target, df_LR_photoz.prob)
Out[73]:
In [74]:
sklearn.metrics.log_loss(df_RF_with_vgg.target, df_RF_with_vgg.prob)
Out[74]:
In [75]:
sklearn.metrics.log_loss(df_RF_with_vgg_more.target, df_RF_with_vgg_more.prob)
Out[75]:
In [76]:
sklearn.metrics.log_loss(df_cnn_with_photometry[df_cnn_with_photometry.testing].target,
df_cnn_with_photometry[df_cnn_with_photometry.testing].CNN_prob,)
Out[76]:
In [77]:
sklearn.metrics.log_loss(df_cnn[df_cnn.testing].target,
df_cnn[df_cnn.testing].CNN_prob,)
Out[77]:
In [ ]: